from pathlib import Path
from IPython.display import display
import numpy as np
import pandas as pd
import seaborn as sns
# Default figure size for every seaborn/matplotlib figure, in inches.
sns.set(rc={"figure.figsize": (11.7, 8.27)})
import matplotlib.pyplot as plt
# Show every column when a DataFrame is displayed. The fully qualified key is
# required: the short form 'max_columns' matches more than one pandas option
# in recent versions and is rejected as ambiguous.
pd.set_option('display.max_columns', None)
# Load the tab-separated data file into the working DataFrame.
data_path = Path('private') / 'data.csv'
df = pd.read_csv(data_path, sep='\t')
Let's list the most useful columns for our analysis:
# Columns of interest, grouped by what they describe.
# NOTE(review): in the visible code this list is reassigned further down
# before it is ever used to select columns.
col = [
    'added_at',
    # Album fields.
    *(
        f'album.{field}'
        for field in (
            'id',
            'images.0.height',
            'images.0.url',
            'images.0.width',
            'name',
            'release_date',
            'release_date_precision',
        )
    ),
    # Artist fields.
    'artists.id',
    'artists.name',
    # Track fields.
    'duration_min',
    'id',
    'external_urls.spotify',
    'name',
    'popularity',
    'preview_url',
    # Audio descriptors.
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'time_signature',
]
# The ten audio feature columns used for the histograms, the 2D projection
# and the pair plot.
feat = (
    'danceability energy key loudness speechiness '
    'acousticness instrumentalness liveness valence tempo'
).split()
Let's have a general view of the kind of music I listen to:
# One histogram per audio feature, plus duration and popularity, all sharing
# the same y axis. The trailing semicolon hides the cell's text output.
hist_cols = [*feat, 'duration_min', 'popularity']
df[hist_cols].hist(sharey=True, figsize=(16, 10));
There are 10 audio features given by Spotify. To see all our songs in a single view, I will apply a dimensionality reduction algorithm to project them onto a 2D plane. After that I will cluster on this plane. Then we can see if the clustering makes sense...
from umap import UMAP

# Reduce the audio features to two dimensions with UMAP.
# NOTE(review): no `metric_kwds` is supplied for the mahalanobis metric --
# confirm that UMAP's default for it is what is intended here.
mdl_proj = UMAP(
    random_state=0,
    verbose=1,
    metric='mahalanobis',
    n_neighbors=100,
)
embedding = mdl_proj.fit_transform(df[feat])

# Store the 2D coordinates next to the songs and take a first look at them.
proj_cols = ['proj_x', 'proj_y']
df[proj_cols] = pd.DataFrame(embedding)
df[proj_cols].plot(kind='scatter', x='proj_x', y='proj_y');
Let's see where the previously calculated supergenres fall in this projection:
# Colour every song of the 2D projection by its `artists.supergenre_1` value.
# seaborn controls hue colours through `palette` (matplotlib's `cmap` is not
# used for a categorical hue), and passing the palette *name* lets seaborn
# generate as many husl colours as there are hue levels.
g = sns.scatterplot(
    data=df,
    x='proj_x',
    y='proj_y',
    hue='artists.supergenre_1',
    legend='full',
    palette='husl',
    edgecolor=None,
);
g.legend(loc='center right', bbox_to_anchor=(1.5, 0.5), ncol=1);
This is expected, and even intended: I am not focusing on music genre but on how the music sounds, for example a group of "sad" songs spanning multiple genres...
I fine-tuned HDBSCAN through `min_cluster_size` and let `min_samples` be found. `min_cluster_size` should be similar to an album size (10-20) or a playlist size (50-100); a size of 16 seems to be the magic value (other sizes around it do not give a similar number and size of clusters).
from hdbscan import HDBSCAN

# Cluster the songs on their 2D projection coordinates.
# `fit_predict` is called with the coordinates only: its optional second
# argument (`y`) is not used for clustering, so nothing is passed for it.
# NOTE(review): the accompanying text mentions a cluster size of 16 and an
# automatically chosen `min_samples`; the values below differ -- confirm
# which ones are intended.
clusterer = HDBSCAN(min_cluster_size=10, min_samples=20)
df['clusters'] = clusterer.fit_predict(df[['proj_x', 'proj_y']])

# Turn the integer labels into strings such as 'c0', 'c1', ...
df['clusters'] = df['clusters'].map(lambda x: f'c{x}')
# Horizontal bar chart of the number of songs in each cluster.
plt.figure()
cluster_sizes = df['clusters'].value_counts()
cluster_sizes.plot(kind='barh');
# Scatter plot of the 2D projection, one colour per cluster.
# seaborn controls hue colours through `palette` (matplotlib's `cmap` is not
# used for a categorical hue), and passing the palette *name* lets seaborn
# generate as many husl colours as there are clusters.
plt.figure()
g = sns.scatterplot(
    data=df,
    x='proj_x',
    y='proj_y',
    hue='clusters',
    legend='full',
    palette='husl',
    edgecolor=None,
);
# Place the legend outside the axes, to the right of the plot.
g.legend(loc='center right', bbox_to_anchor=(1.5, 0.5), ncol=1);
Let's see if those clusters make sense by looking at the `top_n` most popular songs of each cluster:
# Number of most popular songs to keep for each cluster.
top_n = 3

# Columns carried over into the per-cluster selection.
col = [
    *feat,
    'clusters',
    'proj_x',
    'proj_y',
    'popularity',
    'duration_min',
    'name',
    'artists.name',
    'album.name',
    'artists.genres',
    'artists.supergenres',
    'artists.supergenre_1',
    'external_urls.spotify',
    'preview_url',
]


def _most_popular(group):
    """Return the `top_n` rows of *group* with the highest popularity."""
    ranked = group.sort_values('popularity', ascending=False)
    return ranked.head(top_n)


df_top = df[col].groupby('clusters').apply(_most_popular)

# Show a random handful of the selected songs.
df_top.sample(5)
An interactive plot with Plotly Express (`px`) would help to see where the songs are in the projection and what they are:
def _hover_text(row):
    """Build the HTML snippet stored in the `hover_text` column for one song."""
    return (
        '<br>'
        f'Full Song: <a href="{row["external_urls.spotify"]}">Play</a><br>'
        f'Album: {row["album.name"]}<br>'
        f'Super Genre 1: {row["artists.supergenre_1"]}<br>'
    )


def _full_name(row):
    """Return "<song name>, <artist name>" followed by an HTML line break."""
    return f'{row["name"]}, {row["artists.name"]}<br>'


df_top['hover_text'] = df_top.apply(_hover_text, axis=1)
df_top['fullname'] = df_top.apply(_full_name, axis=1)

# Marker size: log10 of the popularity, shifted by one so that a popularity
# of 0 maps to a size of 0.
df_top['size'] = np.log10(df_top['popularity'] + 1)
# Pairwise relationships between the audio features of the selected songs,
# coloured by cluster; only the lower triangle of the grid is drawn.
pair_data = df_top[[*feat, 'clusters']]
sns.pairplot(
    pair_data,
    hue='clusters',
    corner=True,
    diag_kind='kde',
    plot_kws={'edgecolor': None},
);
I projected the 4000+ liked songs in my Spotify playlist with UMAP and clustered them with HDBSCAN. These are the top 3 most popular songs in each of the clusters.
Below is a plot done with Plotly, where I embedded a link to a song preview file.
You can hover over a point to see which song it is, or click on its link to listen to it.
from plotly.offline import download_plotlyjs, init_notebook_mode
import plotly.express as px

init_notebook_mode(connected=True)

# One "Play" link per song, pointing at its preview URL; used as point text.
play_links = df_top['preview_url'].map(lambda url: f'<a href="{url}">Play</a>')

# Interactive scatter plot of the selected songs on the 2D projection.
fig = px.scatter(
    df_top,
    x='proj_x',
    y='proj_y',
    color='clusters',
    color_discrete_sequence=px.colors.qualitative.Safe,
    size='size',
    text=play_links,
    hover_name='fullname',
    hover_data=['hover_text'],
)
# Fixed-size figure instead of letting Plotly size it automatically.
fig.update_layout(autosize=False, width=800, height=600)